# -- encoding:utf-8 --
import openpyxl
import re
import csv
import jieba

# Clean a string and split it into space-separated tokens
def clean_str(string):
    """Normalize a mixed Chinese/English string into space-separated tokens.

    Adapted from
    https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    with the character whitelist extended to cover Chinese text.

    The steps run in this order:

    1. Every ASCII space character is deleted.
    2. Each character outside the whitelist (CJK U+4E00-U+9FA5, ASCII
       letters and digits, and the punctuation ``(),.!?，。？！、“”'`` plus
       the backtick) is replaced by a space.
    3. English clitics ('s, 've, n't, 're, 'd, 'll) are split off with a
       leading space.
    4. ``,`` and ``!`` are padded with spaces. ``(``, ``)`` and ``?`` are
       padded as well and come out preceded by a literal backslash, which
       is what the replacement templates have always produced.
    5. Runs of whitespace are collapsed to one space and the result is
       stripped.

    Args:
        string: The text to clean.

    Returns:
        The cleaned text.
    """
    # Drop the original spacing first; separators are re-created below.
    string = string.replace(' ', '')

    # Anything outside the whitelist becomes a separator.
    string = re.sub(r"[^\u4e00-\u9fa5A-Za-z0-9(),.!?，。？！、“”\'\`]", " ", string)

    # Ordered (pattern, replacement) rules. Order matters: the whitespace
    # collapse has to run last. Replacements containing a backslash are raw
    # strings so the module compiles without invalid-escape warnings; the
    # backslash still ends up in the output because `re` leaves unknown
    # non-letter escapes in a replacement template untouched.
    substitutions = (
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", r" \( "),
        (r"\)", r" \) "),
        (r"\?", r" \? "),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)
    return string.strip()

# Build dictionary rows from the listings workbook.
#
# Reads column D (district name) and column K (owner name) from a fixed row
# range of the listing sheet, cleans and de-duplicates the values, and
# appends them to DICT_NOW.csv as `value,TAG` rows.
rb = openpyxl.load_workbook('./building_data/原始数据.xlsx')
# Index access replaces the deprecated Workbook.get_sheet_by_name().
sheet = rb['镇区盘源-最新盘源 (2)']

name_list = []
address_list = []
# Stays empty: commission extraction (column J, tag 'YJ') is disabled.
commission_list = []

# The data rows are hard-coded for this workbook: rows 4..457 inclusive.
for h_id in range(4, 458):
    # District name
    address = sheet['D' + str(h_id)].value
    if address:
        # Strings are immutable, so the cleaned value must be captured;
        # calling clean_str() without using its result has no effect.
        address = clean_str(address)
        if address:  # skip cells that clean down to nothing
            address_list.append(address)

    # Owner name
    name = sheet['K' + str(h_id)].value
    if name:
        name = clean_str(name)
        if name:
            name_list.append(name)

# De-duplicate while keeping first-seen order so the rows written to the CSV
# are the same from run to run (iterating a set of str is not).
address_list = list(dict.fromkeys(address_list))
# Each district name is tagged 'DQM'.
bz_address_list = [[address, 'DQM'] for address in address_list]

name_list = list(dict.fromkeys(name_list))
# Each owner name is tagged 'YZN'.
bz_name_list = [[name, 'YZN'] for name in name_list]

# NOTE(review): no explicit encoding is passed, so the platform default is
# used. Because the file is opened in append mode, confirm the encoding of
# the existing DICT_NOW.csv before pinning one (e.g. encoding='utf-8').
with open('DICT_NOW.csv', 'a', newline='') as f:
    rows = bz_address_list + bz_name_list
    writer = csv.writer(f)
    # Append all rows in one call.
    writer.writerows(rows)



